{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyspark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc = pyspark.SparkContext()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 直接读取本地数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "rdd = sc.textFile(\"./data/eq2013.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['date,time,x,y,z,lv,M,v1,v2,v3,v4',\n",
       " '2013/5/31,23:41:56,-113.408,37.175,6.6,2.5,ML2.5,SLC,,UTAH,',\n",
       " '2013/5/31,23:09:05,-113.411,37.178,6,2.5,ML2.5,SLC,,UTAH,',\n",
       " '2013/5/31,22:45:34,-113.413,37.172,4,2.9,ML2.9,SLC,,UTAH,',\n",
       " '2013/5/31,22:34:26,-113.414,37.174,3.2,2.8,ML2.8,SLC,,UTAH,']"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rdd.take(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8205"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rdd.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 通过HDFS读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "rdd2 = sc.textFile(\"hdfs://sparkvm.com:8020/data/eq/eq2013.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['date,time,x,y,z,lv,M,v1,v2,v3,v4',\n",
       " '2013/5/31,23:41:56,-113.408,37.175,6.6,2.5,ML2.5,SLC,,UTAH,',\n",
       " '2013/5/31,23:09:05,-113.411,37.178,6,2.5,ML2.5,SLC,,UTAH,',\n",
       " '2013/5/31,22:45:34,-113.413,37.172,4,2.9,ML2.9,SLC,,UTAH,',\n",
       " '2013/5/31,22:34:26,-113.414,37.174,3.2,2.8,ML2.8,SLC,,UTAH,']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rdd2.take(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8205"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rdd2.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 通过Python自行读取进行序列化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "eq = [row[:-1] for row in open(\"./data/eq2013.csv\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['date,time,x,y,z,lv,M,v1,v2,v3,v4',\n",
       " '2013/5/31,23:41:56,-113.408,37.175,6.6,2.5,ML2.5,SLC,,UTAH,',\n",
       " '2013/5/31,23:09:05,-113.411,37.178,6,2.5,ML2.5,SLC,,UTAH,',\n",
       " '2013/5/31,22:45:34,-113.413,37.172,4,2.9,ML2.9,SLC,,UTAH,',\n",
       " '2013/5/31,22:34:26,-113.414,37.174,3.2,2.8,ML2.8,SLC,,UTAH,']"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "eq[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "rdd3 = sc.parallelize(eq)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['date,time,x,y,z,lv,M,v1,v2,v3,v4',\n",
       " '2013/5/31,23:41:56,-113.408,37.175,6.6,2.5,ML2.5,SLC,,UTAH,',\n",
       " '2013/5/31,23:09:05,-113.411,37.178,6,2.5,ML2.5,SLC,,UTAH,',\n",
       " '2013/5/31,22:45:34,-113.413,37.172,4,2.9,ML2.9,SLC,,UTAH,',\n",
       " '2013/5/31,22:34:26,-113.414,37.174,3.2,2.8,ML2.8,SLC,,UTAH,']"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rdd3.take(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8205"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rdd3.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 通过JDBC读取数据库转为RDD"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Spark主要通过spark.sql来实现数据库获取"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 首先把Postgresql的JDBC包，导入到环境变量中"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.environ['SPARK_CLASSPATH']=\\\n",
    "\"$SPARK_CLASSPATH;{0}/data/postgresql-42.2.4.jar\".format(os.getcwd())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### PostgreSQL的JDBC连接字符串"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "url=\"jdbc:postgresql://sparkvm.com:5432/postgis?user=postgres&password=postgres\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyspark.sql"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "pyssql = pyspark.sql.SQLContext(sc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 通过SQLContext来获取数据，需要设置读取模式、URL和读取的表格"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 = pyssql.read.format(\"jdbc\").\\\n",
    "option(\"url\", url).option(\"dbtable\",\"china\").load()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 显示前面5行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------------+----------+------+------------+--------+--------+--------+--------+--------+---------+----------+----------+--------+--------+--------+--------+--------+--------+----------+\n",
      "| id|                geom|first_name|  code|        area|pop_2009|pop_2005|pop_2000|pop_1999|pop_1995| pop_1990|pop_birth_|pop_death_|gdp_2009|gdp_2008|gdp_2007|gdp_2006|gdp_2005|cpi_2009|categories|\n",
      "+---+--------------------+----------+------+------------+--------+--------+--------+--------+--------+---------+----------+----------+--------+--------+--------+--------+--------+--------+----------+\n",
      "|  1|0106000020E610000...|        北京|110000| 1.634556E10|  1755.0|  1538.0|  1382.0|  1257.0|  1251.0|1081.9407|      8.06|      5.12|12153.03| 11115.0| 9846.81| 8117.78| 6969.52| 22154.0|       1.0|\n",
      "|  2|0106000020E610000...|        天津|120000|1.1660963E10| 1228.16|  1043.0|  1001.0|   959.0|   942.0| 878.5402|       8.3|      6.23| 7521.85| 6719.01| 5252.76| 4462.74| 3905.64| 15149.0|       1.0|\n",
      "|  3|0106000020E610000...|        山西|140000|1.5727177E11| 3427.36|  3355.0|  3297.0|  3204.0|  3077.0|2875.9014|     10.87|      6.12| 7358.31|  7315.4| 6024.45| 4878.61| 4230.53|  6854.0|       1.0|\n",
      "|  7|0106000020E610000...|        上海|310000| 5.2556017E9|  1921.0|  1778.0|  1674.0|  1474.0|  1415.0|1334.1896|      8.64|      7.05|15046.45|14069.86|12494.01|10572.24| 9247.66| 29572.0|       2.0|\n",
      "|  4|0106000020E610000...|        辽宁|210000| 1.450149E11|  4319.0|  4221.0|  4238.0|  4171.0|  4092.0|3945.9697|      6.06|      6.15|15212.49|13668.58| 11164.3| 9304.52| 8047.26| 10848.0|       1.0|\n",
      "+---+--------------------+----------+------+------------+--------+--------+--------+--------+--------+---------+----------+----------+--------+--------+--------+--------+--------+--------+----------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df1.show(5)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
